"""
@Description: Filters out URLs that lead to 404 pages; so far only the CSDN URLs have been filtered.
@Author: Ziheng Xu
@Date: 2022/10/21 16:33
"""
import time
from selenium import webdriver
import json
import random


def sleep_random_interval(start, end):
    """Pause for a random whole number of seconds between start and end.

    The duration is drawn with ``random.randint``, so both bounds are
    inclusive.

    Args:
        start: Lower bound of the pause, in seconds.
        end: Upper bound of the pause, in seconds.
    """
    pause_seconds = random.randint(start, end)
    time.sleep(pause_seconds)

# Key used to look up this site's settings in config.json.
app_name = "Toutiao"

# One candidate URL per line. Lines keep their trailing newline so that a
# flagged URL can be written to the report exactly as it was read.
with open("./Url_dir/Toutiao_1-300.txt") as fd:
    urls = fd.readlines()

with open("./config.json", "r", encoding="utf-8") as fd:
    config = json.load(fd)
    webdriver_path = config[app_name]["webdriver_path"]

# 1-based index of the URL line currently being processed (0 if none yet).
cnt = 0

# The report file is managed by ``with`` so it is closed (and the URLs found
# so far are flushed) even if the browser session fails midway.
with open("./invalid_url.txt", "w") as invalid_url:
    # NOTE(review): ``executable_path`` is deprecated in Selenium 4 and was
    # removed in later 4.x releases; if the installed Selenium is upgraded,
    # switch to ``webdriver.Chrome(service=Service(webdriver_path))``.
    browser_handle = webdriver.Chrome(executable_path=webdriver_path)
    try:
        browser_handle.implicitly_wait(10)
        browser_handle.get(config[app_name]["index_url"])
        time.sleep(5)
        # Remember the home tab so it can be told apart from opened tabs.
        home_handle = browser_handle.current_window_handle

        for cnt, url in enumerate(urls, start=1):
            print("Current Processing URL index: " + str(cnt))
            target = url.strip()
            if not target:
                # Blank line in the URL list: nothing to visit.
                continue

            # Pass the URL as a script argument instead of splicing it into
            # the JavaScript source, so quotes or backslashes in the URL
            # cannot break or alter the script.
            browser_handle.execute_script("window.open(arguments[0]);", target)
            # Simulate time spent browsing the page.
            sleep_random_interval(3, 5)

            opened_tabs = [
                handle
                for handle in browser_handle.window_handles
                if handle != home_handle
            ]
            if not opened_tabs:
                # The tab did not open (e.g. blocked pop-up); skip it rather
                # than crash or close the home window by mistake.
                print("Failed to open a new tab for: " + target)
                continue

            browser_handle.switch_to.window(opened_tabs[0])
            if "404" in browser_handle.title:
                print(url)
                invalid_url.write(url)
            browser_handle.close()
            # Switch back to the home page.
            browser_handle.switch_to.window(home_handle)
            # Dwell time on the home page.
            sleep_random_interval(1, 2)
    finally:
        # Always shut down the browser and its driver process.
        browser_handle.quit()

